In [1]:
# Import Packages
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
In [2]:
# Read data from files
df_train = pd.read_csv("2022_train.csv")
df_test = pd.read_csv("2022_test.csv")
df_test2 = pd.read_csv("2022_test.csv")
df_train.head()
Out[2]:
Id GP MIN PTS FGM FGA FG% 3P Made 3PA 3P% ... FTA FT% OREB DREB REB AST STL BLK TOV TARGET_5Yrs
0 3799 80 24.3 7.8 3.0 6.4 45.7 0.1 0.3 22.6 ... 2.9 72.1 2.2 2.0 3.8 3.2 1.1 0.2 1.6 1
1 3800 75 21.8 10.5 4.2 7.9 55.1 -0.3 -1.0 34.9 ... 3.6 67.8 3.6 3.7 6.6 0.7 0.5 0.6 1.4 1
2 3801 85 19.1 4.5 1.9 4.5 42.8 0.4 1.2 34.3 ... 0.6 75.7 0.6 1.8 2.4 0.8 0.4 0.2 0.6 1
3 3802 63 19.1 8.2 3.5 6.7 52.5 0.3 0.8 23.7 ... 1.5 66.9 0.8 2.0 3.0 1.8 0.4 0.1 1.9 1
4 3803 63 17.8 3.7 1.7 3.4 50.8 0.5 1.4 13.7 ... 0.5 54.0 2.4 2.7 4.9 0.4 0.4 0.6 0.7 1

5 rows × 21 columns

In [3]:
df_test.head()
Out[3]:
Id GP MIN PTS FGM FGA FG% 3P Made 3PA 3P% FTM FTA FT% OREB DREB REB AST STL BLK TOV
0 0 56 9.1 4.0 1.6 3.7 43.7 0.1 0.3 7.3 0.7 1.2 63.4 1.2 0.8 1.7 0.4 0.2 0.3 0.8
1 1 43 19.3 10.1 3.7 8.1 46.0 0.6 1.7 35.1 1.8 2.5 75.3 0.5 0.9 1.5 3.5 0.6 0.0 1.8
2 2 82 33.9 11.3 4.9 10.6 45.6 0.5 1.9 44.8 1.8 2.7 71.2 1.3 3.3 4.5 2.5 1.3 0.3 2.0
3 3 86 44.7 18.8 6.8 15.9 42.9 0.5 1.8 13.5 4.5 6.3 70.9 1.5 3.2 5.0 4.1 0.9 0.1 3.6
4 4 58 12.3 4.7 1.6 4.0 40.0 0.5 1.7 38.7 1.1 1.3 76.9 0.2 0.6 0.9 1.5 0.5 -0.4 0.9
In [4]:
# Perform some EDA
ProfileReport(df_train, title="EDA of training data")
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[4]:

In [5]:
# Feature Engineering
# Drop columns and cube root the AST
df_train_short = df_train.drop(['GP', 'MIN', 'PTS', 'FGM', 'FGA', '3P Made', '3PA', 'FTM', 'FTA', 'OREB', 'DREB', 'TOV', 'BLK', 'STL', 'REB'], axis=1)
df_train_short['AST TX'] = df_train_short['AST']**(1/3)
df_train_short.drop(['Id', 'AST'], axis = 1, inplace= True)
In [6]:
# Create target variable y from dataset and X
y = df_train_short.pop('TARGET_5Yrs')
X = df_train_short
In [7]:
# Scale the X values
scaler = StandardScaler()
X = scaler.fit_transform(X)
In [8]:
# Split into training and validation datasets at 80% training. Stratified data as well
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, stratify=y)
In [9]:
# Define model and predict the values
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)

y_val_pred_knn = knn.fit(X_train, y_train).predict(X_val)
In [10]:
# Print the AUC score
auc_score3 = roc_auc_score(y_val, y_val_pred_knn)
print(auc_score3)
0.5505198528555271
In [11]:
# Now to try it on test data

df_test_pred = df_test.drop(['GP', 'MIN', 'PTS', 'FGM', 'FGA', '3P Made', '3PA', 'FTM', 'FTA', 'OREB', 'DREB', 'TOV', 'BLK', 'STL', 'REB'], axis=1)
df_test_pred['AST TX'] = df_test_pred['AST']**(1/3)
df_test_pred.drop(['Id', 'AST'], axis = 1, inplace= True)
In [15]:
# Predict the probabilities, then add it to a dataframe with the IDs from the test set
y_test_predictions = knn.fit(X_train, y_train).predict_proba(df_test_pred)
probabilities = y_test_predictions[:,1]
final = pd.DataFrame({'Id':df_test.Id, 'TARGET_5Yrs':probabilities})
final
Out[15]:
Id TARGET_5Yrs
0 0 1.0
1 1 1.0
2 2 1.0
3 3 1.0
4 4 1.0
... ... ...
3794 3794 1.0
3795 3795 1.0
3796 3796 1.0
3797 3797 1.0
3798 3798 1.0

3799 rows × 2 columns

In [17]:
# Save to CSV for upload to Kaggle
final.to_csv('2022_timwang_firsttry.csv', index = False)
In [ ]:
 
In [ ]: